{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# group mechanics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Series and DataFrame "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split apply combine\n",
    "df = pd.DataFrame(data={\n",
    "    'key1': ['a','b','a','a','b'],\n",
    "    'key2': ['one','two','two','one','two'],\n",
    "    'data1': np.random.randn(5),\n",
    "    'data2': np.random.randn(5)\n",
    "})\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# s1 = df['data1']\n",
    "# grouped = s1.groupby(df['key1'])\n",
    "grouped1 = df['data1'].groupby(df['key1'])\n",
    "grouped1.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# give the right length: 5\n",
    "t1 = ['h', 'j', 'k', 'k', 'h']\n",
    "grouped2 = df['data1'].groupby(t1)\n",
    "grouped2.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped3 = df['data1'].groupby([df['key1'], df['key2']])\n",
    "grouped3.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped3.mean().unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped4 = df['data1'].groupby([df['key1'], df['key2'], t1])\n",
    "grouped4.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped4.mean().unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped5 = df.groupby(['key1'])\n",
    "grouped5.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped5.size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped6 = df.groupby(['key1', 'key2'])\n",
    "grouped6.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped6.size()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. iterating over group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, group in df.groupby(['key1']):\n",
    "    print(name)\n",
    "    print(group)\n",
    "\n",
    "for (k1, k2), group in df.groupby(['key1', 'key2']):\n",
    "    print(k1, k2)\n",
    "    print(group)\n",
    "    \n",
    "l1 = list(df.groupby(['key1']))\n",
    "print(l1)\n",
    "d1 = dict(l1)\n",
    "# is-a DataFrame type\n",
    "print('type:', type(d1['a']))\n",
    "\n",
    "# episode\n",
    "l2 = [('aa', 12), ('bb', 13)]\n",
    "d2 = dict(l2)\n",
    "print(d2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. other group operation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = pd.DataFrame(\n",
    "    data=np.random.randn(5,5),\n",
    "    index=['east', 'west', 'north', 'south', 'center'],\n",
    "    columns=['a', 'b', 'c', 'd', 'e']\n",
    ")\n",
    "\n",
    "df2.loc[2:3, ['b','c']] = np.nan\n",
    "\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {\n",
    "    'a': 'red',\n",
    "    'b': 'blue',\n",
    "    'c': 'red',\n",
    "    'd': 'red',\n",
    "    'e': 'blue',\n",
    "    'f': 'yellow'\n",
    "}\n",
    "by_columns = df2.groupby(by=mapping, axis=1)\n",
    "by_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt = dict(list(by_columns))\n",
    "dt['red']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(-0.477009+0.767283-0.773931)\n",
    "by_columns.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print((-0.477009+0.767283-0.773931)/3)\n",
    "by_columns.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = pd.Series(data=mapping)\n",
    "s1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.groupby(by=s1, axis=1).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df)\n",
    "idx_len_by_rows = df2.groupby(by=len)\n",
    "list(idx_len_by_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# must right row length: 5\n",
    "idx_len_other_by_rows = df2.groupby(by=[len, ['o1', 'o2', 'o2', 'o2', 'o1']], axis=0)\n",
    "idx_len_other_by_rows.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = pd.MultiIndex.from_arrays([['JP', 'JP', 'JP', 'US', 'US'], [1,1,5,1,3]], names=['cty', 'tenor'])\n",
    "print(type(cols))\n",
    "print(cols)\n",
    "df3 = pd.DataFrame(\n",
    "    data=np.random.randn(4,5),\n",
    "    columns=cols\n",
    ")\n",
    "\n",
    "# MultiIndex: \n",
    "#   labels:\n",
    "#         the dim-1 is [0:JP, 1:US]\n",
    "#         the dim-2 is [0:1, 2:5, 1:3]\n",
    "\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# episode for multiindex\n",
    "# m-1\n",
    "idx1 = pd.Index([('A', 'x1'),('A', 'x2'), ('B', 'y1'), ('B', 'y2'), ('B', 'y3')], names=['c1', 'c2'])\n",
    "idx1\n",
    "df4 = pd.DataFrame(\n",
    "    data=np.random.randint(1,10,(5,4)), index=idx1\n",
    ")\n",
    "df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# m-2\n",
    "c1 = ['A', 'A', 'B', 'B', 'B']\n",
    "c2 = ['x1', 'x2', 'y1', 'y2', 'y3']\n",
    "\n",
    "idx2 = pd.MultiIndex.from_arrays([c1, c2], names=['c1', 'c2'])\n",
    "print(idx2)\n",
    "\n",
    "df5 = pd.DataFrame(\n",
    "    data=np.random.randint(1,10,(5,4)), index=idx2\n",
    ")\n",
    "df5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# m-3\n",
    "idx3 = pd.MultiIndex.from_product([['A', 'B'], ['x1', 'y1']], names=['c1', 'c2'])\n",
    "print(idx3)\n",
    "df6 = pd.DataFrame(\n",
    "    data=np.random.randint(1, 10, (2,4)), columns=idx3\n",
    ")\n",
    "df6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Aggregation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = df.groupby('key1')\n",
    "# print(list(grouped['data1']))\n",
    "grouped['data1'].quantile(0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def peak_to_peak(arr):\n",
    "    # type is Series\n",
    "    # print(type(arr))\n",
    "    return arr.max() - arr.min()\n",
    "\n",
    "grouped.agg(peak_to_peak)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## column-wise and multiple function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tips = pd.read_csv('./pydata-book/examples/tips.csv')\n",
    "df_tips[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tips['tip_pct'] = df_tips['tip'] / df_tips['total_bill']\n",
    "df_tips.loc[:5,['tip', 'total_bill', 'tip_pct']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped_by_smoker_time = df_tips.groupby(['smoker', 'time'])\n",
    "grouped_by_smoker_time.agg('mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tip_pct_of_group = grouped_by_smoker_time['tip_pct']\n",
    "tip_pct_of_group.agg([('mycount','count')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tip_size_of_group = grouped_by_smoker_time['tip', 'size']\n",
    "tip_size_of_group.agg('count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tip_size_of_group.agg([('mycount', 'count'), ('mystd', np.std)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tip_size_of_group.agg({'tip': [np.max, np.min], 'size': ['count', 'sum']})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
